import os
import plotly.io as pio
import helpsk as hlp
# Make 'notebook' the default Plotly renderer used by the `.show()` calls below.
pio.renderers.default='notebook'
def get_project_directory():
    """Return the project root derived from the current working directory.

    Starting from ``os.getcwd()``, any trailing ``/develop``, ``/deliver`` or
    ``/archive`` components are removed, followed by a trailing
    ``/code/notebooks``. A working directory that ends in none of these is
    returned unchanged.

    Returns:
        str: the working directory with the notebook-specific trailing
        components removed.
    """
    stage_dirs = ('/develop', '/deliver', '/archive')
    notebooks_dir = '/code/notebooks'

    path = os.getcwd()
    # Only whole trailing components are stripped. A plain substring replace
    # would also corrupt unrelated parts of the path, e.g. turning
    # '/home/developer/proj' into '/homeer/proj'.
    while path.endswith(stage_dirs):
        for suffix in stage_dirs:
            if path.endswith(suffix):
                path = path[:-len(suffix)]
    if path.endswith(notebooks_dir):
        path = path[:-len(notebooks_dir)]
    return path
# Path of the saved experiment YAML, located under the project directory.
file_name = os.path.join(
    get_project_directory(),
    'artifacts/models/experiments',
    'multi-model-BayesSearchCV-2022-03-07-20-09-07.yaml',
)
# Rebuild the experiment results object from that YAML file.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=file_name)
# Show the best score stored in the results.
results.best_score
0.7668027026011365
# Show the `best_params` attribute of the loaded results.
results.best_params
{'model': 'RandomForestClassifier()',
'imputer': 'SimpleImputer()',
'scaler': 'None',
'pca': 'None',
'encoder': 'OneHotEncoder()'}
# Top-scoring trial for each model type.
df = results.to_formatted_dataframe(return_style=False, include_rank=True)
# Rank the trials inside each model group by mean ROC AUC, highest first.
# method="first" breaks ties by order of appearance, so exactly one row per
# model receives rank 1.
df["model_rank"] = (
    df.groupby("model")["roc_auc Mean"]
    .rank(method="first", ascending=False)
)
# Keep only the rank-1 row of every model.
df.loc[df["model_rank"] == 1]
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | ... | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | scaler | pca | encoder | model_rank | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23 | 1 | 0.767 | 0.720 | 0.814 | RandomForestClassifier() | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | None | None | OneHotEncoder() | 1.0 |
| 5 | 2 | 0.763 | 0.725 | 0.802 | LogisticRegression() | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | StandardScaler() | None | OneHotEncoder() | 1.0 |
| 7 | 3 | 0.761 | 0.720 | 0.803 | LinearSVC() | 0.280746 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | PCA('mle') | OneHotEncoder() | 1.0 |
| 16 | 5 | 0.760 | 0.701 | 0.819 | ExtraTreesClassifier() | NaN | 0.68466 | 30.0 | 1659.0 | 25.0 | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | None | PCA('mle') | OneHotEncoder() | 1.0 |
| 28 | 9 | 0.753 | 0.710 | 0.796 | XGBClassifier() | NaN | NaN | 5.0 | 1246.0 | NaN | ... | 0.95619 | 0.694741 | 0.518639 | 0.242199 | 1.220693 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() | 1.0 |
5 rows × 25 columns
# Styled table of the trials with the rank column included (num_rows=1000).
results.to_formatted_dataframe(return_style=True, include_rank=True, num_rows=1000)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | learning_rate | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.767 | 0.720 | 0.814 | RandomForestClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 2 | 0.763 | 0.725 | 0.802 | LogisticRegression() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 3 | 0.761 | 0.720 | 0.803 | LinearSVC() | 0.281 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | PCA('mle') | OneHotEncoder() |
| 4 | 0.761 | 0.697 | 0.825 | LogisticRegression() | 0.001 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | None | OneHotEncoder() |
| 5 | 0.760 | 0.701 | 0.819 | ExtraTreesClassifier() | <NA> | 0.685 | 30.000 | 1,659.000 | 25.000 | 11.000 | 0.781 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 6 | 0.757 | 0.711 | 0.803 | ExtraTreesClassifier() | <NA> | 0.681 | 38.000 | 1,461.000 | 23.000 | 10.000 | 0.553 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | CustomOrdinalEncoder() |
| 7 | 0.755 | 0.714 | 0.796 | RandomForestClassifier() | <NA> | 0.599 | 70.000 | 1,858.000 | 39.000 | 22.000 | 0.851 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | None | OneHotEncoder() |
| 8 | 0.753 | 0.716 | 0.791 | RandomForestClassifier() | <NA> | 0.303 | 81.000 | 1,063.000 | 15.000 | 27.000 | 0.502 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 9 | 0.753 | 0.710 | 0.796 | XGBClassifier() | <NA> | <NA> | 5.000 | 1,246.000 | <NA> | <NA> | <NA> | <NA> | 0.023 | 15.000 | 0.956 | 0.695 | 0.519 | 0.242 | 1.221 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 10 | 0.752 | 0.698 | 0.805 | ExtraTreesClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 11 | 0.751 | 0.721 | 0.781 | LinearSVC() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 12 | 0.749 | 0.698 | 0.801 | XGBClassifier() | <NA> | <NA> | 1.000 | 1,974.000 | <NA> | <NA> | <NA> | <NA> | 0.024 | 4.000 | 0.543 | 0.620 | 0.876 | 0.034 | 1.445 | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 13 | 0.749 | 0.706 | 0.792 | ExtraTreesClassifier() | <NA> | 0.408 | 87.000 | 1,423.000 | 25.000 | 19.000 | 0.989 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | CustomOrdinalEncoder() |
| 14 | 0.747 | 0.694 | 0.799 | ExtraTreesClassifier() | <NA> | 0.710 | 15.000 | 1,493.000 | 33.000 | 27.000 | 0.914 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | OneHotEncoder() |
| 15 | 0.746 | 0.716 | 0.776 | LogisticRegression() | 23.327 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | None | OneHotEncoder() |
| 16 | 0.745 | 0.704 | 0.786 | RandomForestClassifier() | <NA> | 0.762 | 88.000 | 1,235.000 | 8.000 | 7.000 | 0.666 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | CustomOrdinalEncoder() |
| 17 | 0.744 | 0.709 | 0.779 | RandomForestClassifier() | <NA> | 0.567 | 38.000 | 1,060.000 | 19.000 | 41.000 | 0.656 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 18 | 0.742 | 0.686 | 0.798 | XGBClassifier() | <NA> | <NA> | 10.000 | 1,146.000 | <NA> | <NA> | <NA> | <NA> | 0.025 | 14.000 | 0.771 | 0.548 | 0.748 | 0.093 | 1.892 | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() |
| 19 | 0.738 | 0.686 | 0.790 | XGBClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 20 | 0.736 | 0.695 | 0.777 | ExtraTreesClassifier() | <NA> | 0.740 | 14.000 | 1,645.000 | 5.000 | 43.000 | 0.741 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | CustomOrdinalEncoder() |
| 21 | 0.734 | 0.695 | 0.773 | RandomForestClassifier() | <NA> | 0.770 | 70.000 | 1,570.000 | 16.000 | 39.000 | 0.910 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | None | CustomOrdinalEncoder() |
| 22 | 0.730 | 0.702 | 0.758 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | None | CustomOrdinalEncoder() |
| 23 | 0.727 | 0.690 | 0.765 | LinearSVC() | 0.361 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 24 | 0.727 | 0.689 | 0.765 | LinearSVC() | 0.746 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | None | CustomOrdinalEncoder() |
| 25 | 0.727 | 0.692 | 0.762 | LogisticRegression() | 3.489 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 26 | 0.726 | 0.697 | 0.755 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 27 | 0.717 | 0.676 | 0.758 | XGBClassifier() | <NA> | <NA> | 13.000 | 1,153.000 | <NA> | <NA> | <NA> | <NA> | 0.026 | 3.000 | 0.685 | 0.549 | 0.802 | 0.016 | 2.353 | SimpleImputer() | None | PCA('mle') | CustomOrdinalEncoder() |
| 28 | 0.714 | 0.679 | 0.749 | XGBClassifier() | <NA> | <NA> | 4.000 | 1,181.000 | <NA> | <NA> | <NA> | <NA> | 0.067 | 7.000 | 0.557 | 0.763 | 0.592 | 0.001 | 2.984 | SimpleImputer(strategy='median') | None | PCA('mle') | CustomOrdinalEncoder() |
| 29 | 0.701 | 0.669 | 0.733 | LinearSVC() | 10.021 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 30 | 0.660 | 0.610 | 0.710 | LinearSVC() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | None | OneHotEncoder() |
# Ranked table restricted to the RandomForestClassifier() trials.
results.to_formatted_dataframe(
    query='model == "RandomForestClassifier()"',
    include_rank=True,
)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.767 | 0.720 | 0.814 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | OneHotEncoder() |
| 2 | 0.755 | 0.714 | 0.796 | 0.599 | 70.000 | 1,858.000 | 39.000 | 22.000 | 0.851 | gini | SimpleImputer(strategy='most_frequent') | None | OneHotEncoder() |
| 3 | 0.753 | 0.716 | 0.791 | 0.303 | 81.000 | 1,063.000 | 15.000 | 27.000 | 0.502 | gini | SimpleImputer(strategy='median') | None | OneHotEncoder() |
| 4 | 0.745 | 0.704 | 0.786 | 0.762 | 88.000 | 1,235.000 | 8.000 | 7.000 | 0.666 | gini | SimpleImputer(strategy='median') | PCA('mle') | CustomOrdinalEncoder() |
| 5 | 0.744 | 0.709 | 0.779 | 0.567 | 38.000 | 1,060.000 | 19.000 | 41.000 | 0.656 | entropy | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 6 | 0.734 | 0.695 | 0.773 | 0.770 | 70.000 | 1,570.000 | 16.000 | 39.000 | 0.910 | entropy | SimpleImputer(strategy='most_frequent') | None | CustomOrdinalEncoder() |
# Ranked table restricted to the LogisticRegression() trials.
results.to_formatted_dataframe(
    query='model == "LogisticRegression()"',
    include_rank=True,
)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|
| 1 | 0.763 | 0.725 | 0.802 | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 2 | 0.761 | 0.697 | 0.825 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | None | OneHotEncoder() |
| 3 | 0.746 | 0.716 | 0.776 | 23.327 | SimpleImputer(strategy='median') | StandardScaler() | None | OneHotEncoder() |
| 4 | 0.730 | 0.702 | 0.758 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | None | CustomOrdinalEncoder() |
| 5 | 0.727 | 0.692 | 0.762 | 3.489 | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 6 | 0.726 | 0.697 | 0.755 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
# Query shared by every plot below except the first, which covers all models.
rf_query = 'model == "RandomForestClassifier()"'

# Performance across trials, faceted by model.
results.plot_performance_across_trials(facet_by='model').show()

# The remaining plots look at the RandomForestClassifier() trials only.
results.plot_performance_across_trials(query=rf_query).show()
results.plot_parameter_values_across_trials(query=rf_query).show()
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()
results.plot_performance_numeric_params(query=rf_query, height=800)
results.plot_parallel_coordinates(query=rf_query).show()
results.plot_performance_non_numeric_params(query=rf_query).show()
results.plot_score_vs_parameter(
    query=rf_query,
    parameter='max_features',
    size='max_depth',
    color='encoder',
)
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='max_depth'
# )
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='imputer'
# )